tarsec 0.2.2 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/parsers/markdown/blocks.d.ts +14 -0
- package/dist/parsers/markdown/blocks.js +189 -0
- package/dist/parsers/markdown/frontmatter.d.ts +22 -0
- package/dist/parsers/markdown/frontmatter.js +80 -0
- package/dist/parsers/markdown/index.d.ts +7 -0
- package/dist/parsers/markdown/index.js +29 -0
- package/dist/parsers/markdown/inline.d.ts +48 -0
- package/dist/parsers/markdown/inline.js +249 -0
- package/dist/parsers/markdown/references.d.ts +5 -0
- package/dist/parsers/markdown/references.js +96 -0
- package/dist/parsers/markdown/types.d.ts +125 -0
- package/dist/parsers/markdown/types.js +2 -0
- package/package.json +6 -1
package/README.md
CHANGED
|
@@ -48,7 +48,7 @@ parser("hello there"); // failure
|
|
|
48
48
|
- [Pretty error messages](/tutorials/pretty-errors.md)
|
|
49
49
|
|
|
50
50
|
## Examples
|
|
51
|
-
- [A markdown parser](/
|
|
51
|
+
- [A CommonMark-ish markdown parser](/lib/parsers/markdown) — importable as `tarsec/parsers/markdown`. Supports headings (ATX 1–6 with optional trailing `#` stripping, plus setext), fenced and indented code blocks, multi-backtick inline code spans, multi-line / nested block quotes, ordered / unordered / nested lists, pipe tables with alignment, horizontal rules, HTML passthrough, VitePress-style YAML frontmatter, plus inline bold/italic (`*` and `_`), combined `***bold-italic***`, strikethrough, escapes, autolinks, hard *and* soft line breaks, images and links with optional `"title"`, and reference-style links / footnotes resolved in a post-parse pass. Paragraphs round-trip soft-wrapped lines through an `inline-soft-break` node. Inline emphasis, strike, and link content all nest, so ``**[link](u)**`` and ``*a `code` b*`` round-trip into the AST.
|
|
52
52
|
|
|
53
53
|
Read more about [use cases for tarsec](/tutorials/use-case.md).
|
|
54
54
|
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { Parser } from "../../types.js";
|
|
2
|
+
import { Heading, CodeBlock, BlockQuote, Paragraph, HorizontalRule, List, Table, HTMLBlock } from "./types.js";
|
|
3
|
+
export { imageParser } from "./inline.js";
|
|
4
|
+
export declare const headingParser: Parser<Heading>;
|
|
5
|
+
export declare const codeBlockParser: Parser<CodeBlock>;
|
|
6
|
+
export declare const blockQuoteParser: Parser<BlockQuote>;
|
|
7
|
+
export declare const indentedCodeBlockParser: Parser<CodeBlock>;
|
|
8
|
+
export declare const setextHeadingParser: Parser<Heading>;
|
|
9
|
+
export declare const listParser: Parser<List>;
|
|
10
|
+
export declare const blankLine: Parser<unknown>;
|
|
11
|
+
export declare const htmlBlockParser: Parser<HTMLBlock>;
|
|
12
|
+
export declare const tableParser: Parser<Table>;
|
|
13
|
+
export declare const horizontalRuleParser: Parser<HorizontalRule>;
|
|
14
|
+
export declare const paragraphParser: Parser<Paragraph>;
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
import { seqC, seqR, capture, optional, or, manyTillStr, many1Till, exactly, many, many1, many1WithJoin, map, not, lazy, } from "../../combinators.js";
|
|
2
|
+
import { str, spaces, char, eof, set, alphanum, oneOf, noneOf, } from "../../parsers.js";
|
|
3
|
+
import { digit, letter } from "../../parsers.js";
|
|
4
|
+
import { manyTill } from "../../combinators.js";
|
|
5
|
+
import { inlineMarkdownParser, softBreakParser } from "./inline.js";
|
|
6
|
+
export { imageParser } from "./inline.js";
|
|
7
|
+
// One character of a fenced-code language tag: an alphanumeric or one of `_ + # . -`.
const languageChar = or(alphanum, oneOf("_+#.-"));
|
|
8
|
+
// The language tag following an opening code fence: one or more `languageChar`s joined into a string.
const languageTag = many1WithJoin(languageChar);
|
|
9
|
+
/* ATX heading marker: 1–6 consecutive `#`, not followed by another `#`.
|
|
10
|
+
* Try widest first so `###` doesn't parse as level 1 and leave `##` behind.
|
|
11
|
+
* `not(char("#"))` rejects 7+ `#` runs (they fall through to a paragraph). */
|
|
12
|
+
const atxMarker = or(...[6, 5, 4, 3, 2, 1].map((level) => {
    // Exactly `level` hashes, and the next character must not extend the run.
    const hashes = seqR(exactly(level, char("#")), not(char("#")));
    return map(hashes, () => level);
}));
|
|
13
|
+
/* An optional trailing run of `#`s on an ATX heading: at least one separating
|
|
14
|
+
* space, one or more `#`, optional trailing spaces, then end-of-line. */
|
|
15
|
+
const trailingHashRun = seqR(
    many1(char(" ")), // separating space(s)
    many1(char("#")), // the closing hash run
    many(char(" ")), // optional trailing spaces
    or(char("\n"), eof) // end of line
);
|
|
16
|
+
/* The heading body — everything up to (but not including) either the line end
|
|
17
|
+
* or a trailing `#` run. We capture this as a raw string then re-parse it as
|
|
18
|
+
* inline markdown so the content has the same shape as setext headings. */
|
|
19
|
+
const headingBody = many1Till(
    // Stop before the line end or before a closing `#` run.
    or(char("\n"), trailingHashRun)
);
|
|
20
|
+
/* ATX heading: marker, spaces, raw body, optional closing `#` run, optional
 * newline. The raw body is re-parsed as inline markdown; when that re-parse
 * fails the body is kept as a single inline-text node. */
export const headingParser = map(
    seqC(
        capture(atxMarker, "level"),
        spaces,
        capture(headingBody, "body"),
        optional(trailingHashRun),
        optional(char("\n"))
    ),
    (caps) => {
        const parsed = many1(inlineMarkdownParser)(caps.body);
        const content = parsed.success
            ? parsed.result
            : [{ type: "inline-text", content: caps.body }];
        return { type: "heading", level: caps.level, content };
    }
);
|
|
30
|
+
/* Fenced code block: opening fence, optional language tag, optional spaces,
 * then everything up to the closing fence. */
export const codeBlockParser = seqC(
    set("type", "code-block"),
    str("```"),
    capture(optional(languageTag), "language"),
    optional(spaces),
    capture(manyTillStr("```"), "content"),
    str("```")
);
|
|
31
|
+
/* Multi-line and nested block quotes.
|
|
32
|
+
*
|
|
33
|
+
* - Consume consecutive lines beginning with "> " (the space is optional).
|
|
34
|
+
* - Join their stripped content with newlines.
|
|
35
|
+
* - Recursively re-parse the inner text: a sub-blockquote OR inline markdown.
|
|
36
|
+
*
|
|
37
|
+
* `lazy` defers the self-reference so we can recurse for nesting. */
|
|
38
|
+
// One quoted line: `>`, an optional single space, the line text, then newline or eof.
// Yields only the line text.
const blockQuoteLine = map(
    seqC(
        char(">"),
        optional(char(" ")),
        capture(manyTillStr("\n"), "line"),
        or(char("\n"), eof)
    ),
    (caps) => caps.line
);
|
|
39
|
+
// Inside the joined inner text, accept either a nested blockquote (possibly
|
|
40
|
+
// after a leading newline), a soft newline between lines, or any inline node.
|
|
41
|
+
const softNewline = map(char("\n"), () => {
    // A newline between joined quote lines becomes a single-space text node.
    return { type: "inline-text", content: " " };
});
|
|
42
|
+
// Skips any leading newlines, then parses a block quote; `lazy` defers the
// reference to `blockQuoteParser`, which is declared further down.
const nestedBlockQuote = lazy(() => map(
    seqC(many(char("\n")), capture(blockQuoteParser, "quote")),
    (caps) => caps.quote
));
|
|
43
|
+
const blockQuoteContent = or(
    nestedBlockQuote, // a quote nested inside the stripped text
    softNewline, // newline between joined lines
    inlineMarkdownParser // any other inline node
);
|
|
44
|
+
// Re-parse the joined inner text as a sequence of blockquote-content nodes.
|
|
45
|
+
// (We have to round-trip through a string because the `>` prefixes need to be
|
|
46
|
+
// stripped before nested blockquotes can be recognised.)
|
|
47
|
+
// Parse the stripped quote text into blockquote-content nodes; a failed parse
// yields an empty list.
const reparseInner = (innerText) => {
    const parsed = many1(blockQuoteContent)(innerText);
    if (!parsed.success) {
        return [];
    }
    return parsed.result;
};
|
|
51
|
+
// One or more consecutive quote lines, joined with "\n" and re-parsed as
// blockquote content.
export const blockQuoteParser = map(many1(blockQuoteLine), (lines) => {
    const joined = lines.join("\n");
    return { type: "block-quote", content: reparseInner(joined) };
});
|
|
55
|
+
/* Indented code block: one or more consecutive lines beginning with 4 spaces
|
|
56
|
+
* or a tab. The indent is stripped from each line. */
|
|
57
|
+
// Indent that opens an indented-code line: exactly four spaces, or one tab.
// (A single-space literal here would turn every line starting with one space
// into a code block, contradicting the documented "4 spaces or a tab" rule.)
const indentPrefix = or(str("    "), char("\t"));
|
|
58
|
+
// One indented line with its indent stripped; the result always ends in "\n".
const indentedLine = map(
    seqC(
        indentPrefix,
        capture(manyTillStr("\n"), "line"),
        or(char("\n"), eof)
    ),
    (caps) => `${caps.line}\n`
);
|
|
59
|
+
// One or more indented lines concatenated into a single string.
const indentedLines = map(many1(indentedLine), (chunks) => {
    return chunks.join("");
});
|
|
60
|
+
// Indented code block: same node type as a fenced block, with a null language.
export const indentedCodeBlockParser = seqC(
    set("type", "code-block"),
    set("language", null),
    capture(indentedLines, "content")
);
|
|
61
|
+
/* Setext-style headings: a line of content followed by an underline of `=`
|
|
62
|
+
* (level 1) or `-` (level 2), terminated by `\n` or end-of-input. We capture
|
|
63
|
+
* the first line as a raw string, then re-parse it as inline markdown so the
|
|
64
|
+
* heading's content has the same shape as ATX headings. */
|
|
65
|
+
// The heading text line of a setext heading: one or more non-newline characters, joined.
const setextLine = many1WithJoin(noneOf("\n"));
|
|
66
|
+
// A run of `=` marks a level-1 heading.
const setextH1Underline = map(many1(char("=")), () => {
    return 1;
});
|
|
67
|
+
// A run of `-` marks a level-2 heading.
const setextH2Underline = map(many1(char("-")), () => {
    return 2;
});
|
|
68
|
+
// Raw setext capture: text line, newline, underline (gives the level), then newline or eof.
const _setextRaw = seqC(
    set("type", "heading"),
    capture(setextLine, "content"),
    char("\n"),
    capture(or(setextH1Underline, setextH2Underline), "level"),
    or(char("\n"), eof)
);
|
|
69
|
+
// Setext heading: the captured text line is re-parsed as inline markdown; when
// that re-parse fails the raw text is kept as a single inline-text node.
export const setextHeadingParser = map(_setextRaw, ({ content: rawText, level }) => {
    const parsed = many1(inlineMarkdownParser)(rawText);
    const content = parsed.success
        ? parsed.result
        : [{ type: "inline-text", content: rawText }];
    return { type: "heading", level, content };
});
|
|
79
|
+
// Bullet marker (`-`, `*` or `+`): an unordered list, start fixed at 1.
const unorderedMarker = map(oneOf("-*+"), () => {
    return { ord: false, start: 1 };
});
|
|
80
|
+
// Ordered marker: `<digits>.`; the digits become the list's start number.
const orderedMarker = map(
    seqC(capture(many1WithJoin(digit), "digits"), char(".")),
    (caps) => ({ ord: true, start: Number.parseInt(caps.digits, 10) })
);
|
|
81
|
+
// Parser for an indent of `n` spaces; a non-positive `n` matches the empty string.
const indentOf = (n) => {
    if (n > 0) {
        return str(" ".repeat(n));
    }
    return str("");
};
|
|
82
|
+
/* GFM task-list checkbox: `[ ]` (unchecked), `[x]` or `[X]` (checked).
|
|
83
|
+
* Must be followed by a single space (consumed) to count as a checkbox. */
|
|
84
|
+
// Yields true for `[x]` / `[X]` and false for `[ ]`.
const taskCheckbox = map(
    seqC(
        char("["),
        capture(or(char(" "), char("x"), char("X")), "mark"),
        str("] ")
    ),
    (caps) => caps.mark !== " "
);
|
|
85
|
+
// First line of a list item at the given indent: indent, marker, one space,
// optional task checkbox, then the rest of the line. `checked` is only set on
// the result when a checkbox was present.
const itemHeadOf = (indent, markerParser) => map(
    seqC(
        indentOf(indent),
        capture(markerParser, "marker"),
        char(" "),
        capture(optional(taskCheckbox), "checked"),
        capture(manyTillStr("\n"), "line"),
        or(char("\n"), eof)
    ),
    ({ marker, checked, line }) => (checked !== null ? { marker, line, checked } : { marker, line })
);
|
|
91
|
+
// Parse one line of text into inline nodes; a failed parse yields an empty list.
const parseInline = (line) => {
    const parsed = many1(inlineMarkdownParser)(line);
    if (!parsed.success) {
        return [];
    }
    return parsed.result;
};
|
|
95
|
+
// One list item: an item-head followed by an optional sublist at +2 indent.
|
|
96
|
+
// A list item head plus an optional nested list indented two further columns.
// Returns the marker alongside the built item so `listOf` can read the list kind.
const itemWithSublist = (indent, markerParser) => map(
    seqC(
        capture(itemHeadOf(indent, markerParser), "raw"),
        capture(optional(lazy(() => listParserAt(indent + 2))), "sublist")
    ),
    (caps) => {
        const { marker, line, checked } = caps.raw;
        const item = { content: parseInline(line) };
        if (caps.sublist) {
            item.sublist = caps.sublist;
        }
        if (checked !== undefined) {
            item.checked = checked;
        }
        return { marker, item };
    }
);
|
|
104
|
+
// A list of one or more items that all share a marker family.
|
|
105
|
+
// One or more items parsed with the same marker parser; `ordered` and `start`
// are taken from the first item's marker.
const listOf = (indent, markerParser) => {
    const item = itemWithSublist(indent, markerParser);
    const moreItems = many(itemWithSublist(indent, markerParser));
    return map(seqC(capture(item, "first"), capture(moreItems, "rest")), (caps) => {
        const { ord, start } = caps.first.marker;
        const items = [caps.first.item].concat(caps.rest.map((entry) => entry.item));
        return { type: "list", ordered: ord, start, items };
    });
};
|
|
111
|
+
// A list at the given indent: the unordered form is tried first, then the ordered form.
const listParserAt = (indent) => {
    const bulleted = listOf(indent, unorderedMarker);
    const numbered = listOf(indent, orderedMarker);
    return or(bulleted, numbered);
};
|
|
112
|
+
// Top-level list parser: a list whose items start at indent 0.
export const listParser = listParserAt(0);
|
|
113
|
+
/* Tables.
|
|
114
|
+
*
|
|
115
|
+
* Pipe-delimited GFM-style. A table is:
|
|
116
|
+
*
|
|
117
|
+
* | h1 | h2 | ← header row
|
|
118
|
+
* |----|:--:| ← separator row, with alignment markers
|
|
119
|
+
* | a | b | ← one or more data rows
|
|
120
|
+
*
|
|
121
|
+
* Each cell is `noneOf("|\n")`. We `map` the captured content to `.trim()`
|
|
122
|
+
* so headers/rows aren't padded with spaces. */
|
|
123
|
+
const cellContent = map(many1WithJoin(noneOf("|\n")), (rawCell) => {
    // Drop the padding around the cell text.
    return rawCell.trim();
});
|
|
124
|
+
// A cell followed by its closing `|`; yields the trimmed cell text.
const cellThenBar = map(
    seqC(capture(cellContent, "cell"), char("|")),
    (caps) => caps.cell
);
|
|
125
|
+
// A full row: leading `|`, one or more cells, then newline or eof; yields the cell list.
const tableRow = map(
    seqC(
        char("|"),
        capture(many1(cellThenBar), "cells"),
        or(char("\n"), eof)
    ),
    (caps) => caps.cells
);
|
|
126
|
+
// One separator cell (`---`, `:--`, `--:` or `:-:`), optionally space-padded.
// Yields "left", "right", "center", or null when no colon is present.
const sepCell = map(
    seqC(
        many(char(" ")),
        capture(optional(char(":")), "left"),
        many1(char("-")),
        capture(optional(char(":")), "right"),
        many(char(" "))
    ),
    ({ left, right }) => {
        const hasLeft = left !== null;
        const hasRight = right !== null;
        if (hasRight) {
            return hasLeft ? "center" : "right";
        }
        return hasLeft ? "left" : null;
    }
);
|
|
137
|
+
// A separator cell followed by its closing `|`; yields the alignment.
const sepCellThenBar = map(
    seqC(capture(sepCell, "cell"), char("|")),
    (caps) => caps.cell
);
|
|
138
|
+
// The separator row: leading `|`, one or more separator cells, then newline or
// eof; yields the per-column alignments.
const sepRow = map(
    seqC(
        char("|"),
        capture(many1(sepCellThenBar), "cells"),
        or(char("\n"), eof)
    ),
    (caps) => caps.cells
);
|
|
139
|
+
/* HTML blocks (passthrough subset).
|
|
140
|
+
*
|
|
141
|
+
* A line starting with `<` followed by a letter, `/`, `!`, or `?` is treated
|
|
142
|
+
* as the start of a raw HTML block. The block extends until the next blank
|
|
143
|
+
* line or end of input. We don't try to balance tags — the content is kept
|
|
144
|
+
* as a single opaque string so downstream renderers can hand it to an HTML
|
|
145
|
+
* renderer untouched. */
|
|
146
|
+
// Opening of a raw HTML block: `<` followed by a letter or one of `/`, `!`, `?`.
const htmlBlockOpen = seqR(char("<"), or(letter, oneOf("/!?")));
|
|
147
|
+
// "\n" followed by zero or more spaces/tabs followed by another "\n" or end of input.
|
|
148
|
+
export const blankLine = seqR(
    char("\n"),
    many(oneOf(" \t")), // whitespace-only remainder
    or(char("\n"), eof)
);
|
|
149
|
+
// Peek at the opening (`not(not(...))` is a non-consuming lookahead), then
|
|
150
|
+
// consume everything up to the next blank line or eof.
|
|
151
|
+
export const htmlBlockParser = seqC(
    set("type", "html-block"),
    // Double negation: checks for the opener without consuming it.
    not(not(htmlBlockOpen)),
    capture(manyTill(or(blankLine, eof)), "content")
);
|
|
152
|
+
// Table: header row, separator row (alignments), then one or more data rows.
export const tableParser = seqC(
    set("type", "table"),
    capture(tableRow, "headers"),
    capture(sepRow, "alignments"),
    capture(many1(tableRow), "rows")
);
|
|
153
|
+
/* Horizontal rules: three-or-more of the same `-`, `*`, or `_`,
|
|
154
|
+
* with optional spaces between, ending in newline or eof. The "three or
|
|
155
|
+
* more" rule is expressed structurally — three explicit `char(c)`s followed
|
|
156
|
+
* by `many` more — so no count-and-validate wrapper is needed. */
|
|
157
|
+
// Zero or more spaces; used between and around the rule characters of a horizontal rule.
const hrSpaces = many(char(" "));
|
|
158
|
+
// Horizontal rule built from one rule character: three mandatory occurrences,
// any number more, spaces allowed throughout, then newline or eof.
const hrOf = (ruleChar) => map(
    seqR(
        hrSpaces,
        char(ruleChar),
        hrSpaces,
        char(ruleChar),
        hrSpaces,
        char(ruleChar),
        hrSpaces,
        many(seqR(char(ruleChar), hrSpaces)),
        or(char("\n"), eof)
    ),
    () => ({ type: "horizontal-rule" })
);
|
|
159
|
+
// A rule made of `-`, `*` or `_`, tried in that order.
export const horizontalRuleParser = or(
    ...["-", "*", "_"].map((ruleChar) => hrOf(ruleChar))
);
|
|
160
|
+
// Blank line: "\n" followed by zero or more spaces/tabs, then another "\n" or end of input.
|
|
161
|
+
// (`blankLine` is declared near `htmlBlockParser` above, since both need it at
|
|
162
|
+
// module-eval time.)
|
|
163
|
+
/* Block-level constructs that, if they would start at the *current* line
|
|
164
|
+
* position, must interrupt a soft-wrapped paragraph instead of being eaten
|
|
165
|
+
* as inline content. Setext is intentionally excluded — its underline is
|
|
166
|
+
* resolved by `setextHeadingParser` running ahead of `paragraphParser` in
|
|
167
|
+
* the top-level dispatch. */
|
|
168
|
+
/* Matches when the current position starts a block-level construct that must
 * end a soft-wrapped paragraph. Used only under `not(...)`, so what it
 * consumes is irrelevant. The HTML alternative reuses `htmlBlockOpen` rather
 * than repeating its definition, so the two cannot drift apart. */
const blockInterrupt = or(
    // ATX heading (1–6 `#` then a space)
    seqR(atxMarker, char(" ")),
    // Block quote
    char(">"),
    // Fenced code block
    str("```"),
    // Horizontal rule (3+ of -, *, or _ with optional intervening spaces)
    horizontalRuleParser,
    // List marker (unordered or `<digits>.`) followed by a space
    seqR(or(oneOf("-*+"), seqR(many1(digit), char("."))), char(" ")),
    // Table row
    char("|"),
    // HTML block opener
    htmlBlockOpen
);
|
|
183
|
+
/* A paragraph node: an inline node OR a soft line break (single `\n` that
|
|
184
|
+
* isn't the start of a blank line *and* doesn't precede a block opener).
|
|
185
|
+
* Hard breaks (" \n" / "\\\n") win over soft breaks because they're
|
|
186
|
+
* matched earlier inside `inlineMarkdownParser`'s `or`. */
|
|
187
|
+
// A soft break that is not followed by a block opener.
const paragraphSoftBreak = map(
    seqR(softBreakParser, not(blockInterrupt)),
    () => {
        return { type: "inline-soft-break" };
    }
);
|
|
188
|
+
// One paragraph node: refuses to start at a blank line, then takes a soft
// break or any inline node.
const paragraphInline = map(
    seqC(
        not(blankLine),
        capture(or(paragraphSoftBreak, inlineMarkdownParser), "node")
    ),
    (caps) => caps.node
);
|
|
189
|
+
// A paragraph: one or more paragraph nodes.
export const paragraphParser = map(many1(paragraphInline), (nodes) => {
    return { type: "paragraph", content: nodes };
});
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML frontmatter parser for the Markdown example.
|
|
3
|
+
*
|
|
4
|
+
* Supports the spec from https://vitepress.dev/guide/frontmatter:
|
|
5
|
+
* ---
|
|
6
|
+
* title: Docs with VitePress
|
|
7
|
+
* editLink: true
|
|
8
|
+
* ---
|
|
9
|
+
*
|
|
10
|
+
* YAML coverage is a useful subset (top-level `key: value` only), built from
|
|
11
|
+
* Tarsec combinators per the project's "combinator-first" rule:
|
|
12
|
+
* - scalar values: bare strings, single/double-quoted strings, integers,
|
|
13
|
+
* floats, `true`/`false`, `null`/`~`
|
|
14
|
+
* - inline flow lists: [a, b, "c d"]
|
|
15
|
+
*/
|
|
16
|
+
import { Parser } from "../../types.js";
|
|
17
|
+
import { Frontmatter } from "./types.js";
|
|
18
|
+
/**
|
|
19
|
+
* VitePress-style YAML frontmatter: a `---`-delimited block at the very top
|
|
20
|
+
* of a Markdown file.
|
|
21
|
+
*/
|
|
22
|
+
export declare const frontmatterParser: Parser<Frontmatter>;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* YAML frontmatter parser for the Markdown example.
|
|
3
|
+
*
|
|
4
|
+
* Supports the spec from https://vitepress.dev/guide/frontmatter:
|
|
5
|
+
* ---
|
|
6
|
+
* title: Docs with VitePress
|
|
7
|
+
* editLink: true
|
|
8
|
+
* ---
|
|
9
|
+
*
|
|
10
|
+
* YAML coverage is a useful subset (top-level `key: value` only), built from
|
|
11
|
+
* Tarsec combinators per the project's "combinator-first" rule:
|
|
12
|
+
* - scalar values: bare strings, single/double-quoted strings, integers,
|
|
13
|
+
* floats, `true`/`false`, `null`/`~`
|
|
14
|
+
* - inline flow lists: [a, b, "c d"]
|
|
15
|
+
*/
|
|
16
|
+
import { capture, many, many1WithJoin, map, optional, or, seqC, seqR, sepBy, } from "../../combinators.js";
|
|
17
|
+
import { alphanum, char, eof, noneOf, oneOf, quotedString, str, } from "../../parsers.js";
|
|
18
|
+
// --- helpers -----------------------------------------------------------------
|
|
19
|
+
// A single horizontal whitespace character: space or tab.
const hSpace = oneOf(" \t");
|
|
20
|
+
// A run of horizontal whitespace, built with `many` over `hSpace`.
const hSpaces = many(hSpace);
|
|
21
|
+
// Line terminator: a newline character or end of input.
const newlineOrEof = or(char("\n"), eof);
|
|
22
|
+
/** Strip the surrounding quote chars (`'`, `"`, or `` ` ``) added by `quotedString`. */
|
|
23
|
+
const stripQuotes = (quoted) => {
    // Drop the first and the last character.
    return quoted.slice(1, -1);
};
|
|
24
|
+
/**
|
|
25
|
+
* Classify a trimmed bare scalar token into its YAML value.
|
|
26
|
+
* Booleans and null are matched exactly; otherwise we try numeric, else fall
|
|
27
|
+
* back to the raw string.
|
|
28
|
+
*/
|
|
29
|
+
/**
 * Classify a bare (unquoted) scalar token into its YAML value.
 *
 * The token is trimmed first. `true` / `false` become booleans and `null` /
 * `~` become `null` (exact, case-sensitive matches). Any other non-empty token
 * that `Number()` converts to a finite number becomes that number. Everything
 * else — including the empty string, `NaN` and `Infinity` — is returned as the
 * trimmed string.
 *
 * @param raw the untrimmed token text
 * @returns a boolean, null, a finite number, or the trimmed string
 */
function classifyBare(raw) {
    const s = raw.trim();
    switch (s) {
        case "true":
            return true;
        case "false":
            return false;
        case "null":
        case "~":
            return null;
        case "":
            // Number("") is 0, so the empty token must not reach the numeric check.
            return s;
    }
    const n = Number(s);
    // Number.isFinite already rejects NaN, so no separate isNaN test is needed.
    return Number.isFinite(n) ? n : s;
}
|
|
44
|
+
// --- key ---------------------------------------------------------------------
|
|
45
|
+
// Conservative key chars: letters, digits, underscore, hyphen.
|
|
46
|
+
// One key character: an alphanumeric, `_`, or `-`.
const keyChar = or(alphanum, oneOf("_-"));
|
|
47
|
+
// A key: one or more key characters joined into a string.
const yamlKey = many1WithJoin(keyChar);
|
|
48
|
+
// --- scalar values -----------------------------------------------------------
|
|
49
|
+
// Quoted scalar — returns the inner string (quotes stripped).
|
|
50
|
+
const quotedScalar = map(quotedString, (quoted) => {
    return stripQuotes(quoted);
});
|
|
51
|
+
// Bare scalar in top-level context: runs to end-of-line.
|
|
52
|
+
const bareValueLine = map(
    many1WithJoin(noneOf("\n")),
    (token) => classifyBare(token)
);
|
|
53
|
+
// Bare scalar inside a flow list `[...]`: ends at `,` or `]` (or `\n`).
|
|
54
|
+
const bareValueInList = map(
    many1WithJoin(noneOf(",]\n")),
    (token) => classifyBare(token)
);
|
|
55
|
+
// One element of a flow list: optional leading whitespace, then quoted or bare.
|
|
56
|
+
const listElement = map(
    seqC(
        hSpaces,
        capture(or(quotedScalar, bareValueInList), "value")
    ),
    (caps) => caps.value
);
|
|
57
|
+
// Inline flow list: `[a, b, "c d"]`
|
|
58
|
+
const flowList = map(
    seqC(
        char("["),
        capture(sepBy(char(","), listElement), "items"),
        hSpaces,
        char("]")
    ),
    (caps) => caps.items
);
|
|
59
|
+
// Any value: prefer flow list, then quoted, then bare.
|
|
60
|
+
const yamlValue = or(
    flowList, // `[a, b, "c d"]`
    quotedScalar, // quoted string, quotes removed
    bareValueLine // anything else up to end of line
);
|
|
61
|
+
// --- entries / body ----------------------------------------------------------
|
|
62
|
+
// `key: value` — gap between `:` and value may include horizontal whitespace.
|
|
63
|
+
// Yields a `[key, value]` pair.
const yamlEntry = map(
    seqC(
        capture(yamlKey, "key"),
        char(":"),
        hSpaces,
        capture(yamlValue, "value")
    ),
    (caps) => [caps.key, caps.value]
);
|
|
64
|
+
// One terminated entry: `key: value\n` (or eof-terminated).
|
|
65
|
+
const entryLine = map(
    seqC(capture(yamlEntry, "entry"), newlineOrEof),
    (caps) => caps.entry
);
|
|
66
|
+
// The frontmatter body: a sequence of terminated `key: value` entries.
const yamlBody = many(entryLine);
|
|
67
|
+
// --- frontmatter -------------------------------------------------------------
|
|
68
|
+
// The `---` delimiter that opens and closes the frontmatter block.
const fence = str("---");
|
|
69
|
+
// Opening fence line: `---`, optional horizontal whitespace, then a mandatory newline.
const fenceLine = seqR(
    fence,
    optional(hSpaces),
    char("\n")
);
|
|
70
|
+
// Closing fence line: like the opening one, but end of input is also accepted.
const closingFence = seqR(
    fence,
    optional(hSpaces),
    newlineOrEof
);
|
|
71
|
+
/**
|
|
72
|
+
* VitePress-style YAML frontmatter: a `---`-delimited block at the very top
|
|
73
|
+
* of a Markdown file.
|
|
74
|
+
*/
|
|
75
|
+
/* Parses opening fence, entries, closing fence, and returns
 * `{ type: "frontmatter", data }` where `data` maps each key to its value
 * (a later duplicate key overwrites an earlier one).
 *
 * `data` is built with `Object.fromEntries`, which defines every key as an own
 * data property. Plain `data[k] = v` assignment would route a `__proto__` key
 * (accepted by `keyChar`) through the prototype setter and replace the result
 * object's prototype instead of storing the entry. */
export const frontmatterParser = map(
    seqC(fenceLine, capture(yamlBody, "entries"), closingFence),
    ({ entries }) => ({ type: "frontmatter", data: Object.fromEntries(entries) })
);
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export * from "./inline.js";
|
|
3
|
+
export * from "./blocks.js";
|
|
4
|
+
export * from "./references.js";
|
|
5
|
+
export * from "./frontmatter.js";
|
|
6
|
+
import { seq, or, optional, many, capture, seqC, map } from "../../combinators.js";
|
|
7
|
+
import { spaces, newline } from "../../parsers.js";
|
|
8
|
+
import { headingParser, codeBlockParser, blockQuoteParser, paragraphParser, imageParser, horizontalRuleParser, setextHeadingParser, indentedCodeBlockParser, listParser, tableParser, htmlBlockParser, } from "./blocks.js";
|
|
9
|
+
import { linkDefinitionParser, footnoteDefinitionParser, resolveReferences, } from "./references.js";
|
|
10
|
+
import { frontmatterParser } from "./frontmatter.js";
|
|
11
|
+
// Every block-level alternative, tried in this order.
const blockAlt = or(
    setextHeadingParser,
    horizontalRuleParser,
    headingParser,
    codeBlockParser,
    indentedCodeBlockParser,
    tableParser,
    blockQuoteParser,
    listParser,
    htmlBlockParser,
    linkDefinitionParser,
    footnoteDefinitionParser,
    paragraphParser,
    imageParser
);
|
|
12
|
+
// A block followed by zero-or-more trailing newlines. Blocks differ in whether
|
|
13
|
+
// they consume their own terminating "\n" (e.g. headingParser does, codeBlock
|
|
14
|
+
// doesn't), so we can't use sepBy(many1(newline), block) — it would fail to
|
|
15
|
+
// separate two blocks when the first already ate its newline (e.g. a heading
|
|
16
|
+
// directly followed by a list with no intervening blank line).
|
|
17
|
+
const blockEntry = map(
    seqC(capture(blockAlt, "b"), many(newline)),
    (caps) => caps.b
);
|
|
18
|
+
// Whole document: optional frontmatter, optional leading whitespace, the block
// list, optional trailing whitespace. When frontmatter is present it becomes
// the first node of the result.
const _markdownParser = seq([
    optional(frontmatterParser),
    optional(spaces),
    many(blockEntry),
    optional(spaces),
], (parts) => {
    const frontmatter = parts[0];
    const body = parts[2];
    if (!frontmatter) {
        return body;
    }
    return [frontmatter, ...body];
});
|
|
28
|
+
// Resolve [id]: url definitions across the AST after parsing.
|
|
29
|
+
export const markdownParser = map(_markdownParser, (ast) => {
    return resolveReferences(ast);
});
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { Parser } from "../../types.js";
|
|
2
|
+
import { InlineMarkdown, InlineText, InlineBold, InlineItalic, InlineBoldItalic, InlineStrike, InlineHardBreak, InlineSoftBreak, InlineLink, InlineCode, Image, InlineRefLink, InlineRefImage, InlineFootnoteRef, InlineHTML } from "./types.js";
|
|
3
|
+
export declare const inlineTextParser: Parser<InlineText>;
|
|
4
|
+
/**
|
|
5
|
+
* Run `inlineMarkdownParser` repeatedly until `stop` would match at the
|
|
6
|
+
* current position. The `stop` parser is a lookahead — it is *not* consumed.
|
|
7
|
+
* Returns the list of inline nodes collected before `stop`.
|
|
8
|
+
*
|
|
9
|
+
* Used by every delimited inline parser (bold, italic, strike, link, …) so
|
|
10
|
+
* that the content between delimiters is a sequence of inline nodes rather
|
|
11
|
+
* than a flat string.
|
|
12
|
+
*/
|
|
13
|
+
export declare const inlineSeqUntil: (stop: Parser<unknown>) => Parser<InlineMarkdown[]>;
|
|
14
|
+
export declare const inlineBoldParser: Parser<InlineBold>;
|
|
15
|
+
export declare const inlineItalicParser: Parser<InlineItalic>;
|
|
16
|
+
export declare const inlineLinkParser: Parser<InlineLink>;
|
|
17
|
+
export declare const inlineCodeParser: Parser<InlineCode>;
|
|
18
|
+
export declare const inlineEscapeParser: Parser<InlineText>;
|
|
19
|
+
export declare const inlineBoldItalicParser: Parser<InlineBoldItalic>;
|
|
20
|
+
export declare const inlineBoldUnderscoreParser: Parser<InlineBold>;
|
|
21
|
+
export declare const inlineItalicUnderscoreParser: Parser<InlineItalic>;
|
|
22
|
+
export declare const urlAutolinkParser: Parser<InlineLink>;
|
|
23
|
+
export declare const emailAutolinkParser: Parser<InlineLink>;
|
|
24
|
+
export declare const autolinkParser: Parser<InlineLink>;
|
|
25
|
+
export declare const bareUrlAutolinkParser: Parser<InlineLink>;
|
|
26
|
+
export declare const htmlOpenTagParser: Parser<InlineHTML>;
|
|
27
|
+
export declare const htmlCloseTagParser: Parser<InlineHTML>;
|
|
28
|
+
export declare const htmlCommentParser: Parser<InlineHTML>;
|
|
29
|
+
export declare const htmlInlineParser: Parser<InlineHTML>;
|
|
30
|
+
export declare const inlineFootnoteRefParser: Parser<InlineFootnoteRef>;
|
|
31
|
+
export declare const inlineRefLinkParser: Parser<InlineRefLink>;
|
|
32
|
+
export declare const inlineRefImageParser: Parser<InlineRefImage>;
|
|
33
|
+
/** An inline image: `![alt](url)` or `![alt](url "title")`. Lives in `inline.ts`
|
|
34
|
+
* so it can participate in paragraph parsing without `blocks.ts` becoming a
|
|
35
|
+
* circular dep. */
|
|
36
|
+
export declare const imageParser: Parser<Image>;
|
|
37
|
+
export declare const hardBreakParser: Parser<InlineHardBreak>;
|
|
38
|
+
/** A single `\n` that is *not* part of a blank line (which would terminate the
|
|
39
|
+
* enclosing paragraph). Hard breaks are matched earlier in `inlineMarkdownParser`'s
|
|
40
|
+
* `or` so a " \n" stays a hard break, never a soft one. */
|
|
41
|
+
export declare const softBreakParser: Parser<InlineSoftBreak>;
|
|
42
|
+
export declare const inlineStrikeParser: Parser<InlineStrike>;
|
|
43
|
+
export declare const htmlEntityParser: Parser<InlineText>;
|
|
44
|
+
/** Last-resort: consume a single delimiter char as literal text so unmatched
|
|
45
|
+
* delimiters (e.g. the `_` in snake_case_word, or a stray `*`) don't crash
|
|
46
|
+
* the paragraph. Matches one of the inline-text stop characters. */
|
|
47
|
+
export declare const inlineLiteralCharParser: Parser<InlineText>;
|
|
48
|
+
export declare const inlineMarkdownParser: Parser<InlineMarkdown>;
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import { seqC, seqR, capture, captureCaptures, or, not, map, many, many1, many1Till, many1WithJoin, manyWithJoin, manyTillStr, iManyTillStr, count, exactly, lazy, } from "../../combinators.js";
|
|
2
|
+
import { str, char, eof, set, oneOf, alphanum, noneOf, digit, letter, anyChar } from "../../parsers.js";
|
|
3
|
+
import { success, failure } from "../../types.js";
|
|
4
|
+
import { optional, between } from "../../combinators.js";
|
|
5
|
+
// Stop inline-text at any single delimiter char OR at a hard-break sequence
|
|
6
|
+
// (" \n"+). Using many1Till with an `or` of delimiters makes the stop set
|
|
7
|
+
// composable rather than embedded inside a regex. `]` is included so that
|
|
8
|
+
// inline-text inside a link-text (`[...]`) terminates at the closing `]`.
|
|
9
|
+
// Stop set for plain inline text: any one of the listed delimiter characters,
// or the `str(...)` literal below.
// NOTE(review): the literal appears here as a single space, which would end
// plain text at every space; the surrounding comment describes a hard-break
// sequence, so whitespace may have been collapsed in this rendering — confirm
// against the published file.
const inlineTextStop = or(oneOf("*_`[]!<~\\&\n"), str(" "));
|
|
10
|
+
// Plain text up to the next stop; yields an inline-text node.
export const inlineTextParser = map(many1Till(inlineTextStop), (text) => {
    return { type: "inline-text", content: text };
});
|
|
11
|
+
/**
|
|
12
|
+
* Run `inlineMarkdownParser` repeatedly until `stop` would match at the
|
|
13
|
+
* current position. The `stop` parser is a lookahead — it is *not* consumed.
|
|
14
|
+
* Returns the list of inline nodes collected before `stop`.
|
|
15
|
+
*
|
|
16
|
+
* Used by every delimited inline parser (bold, italic, strike, link, …) so
|
|
17
|
+
* that the content between delimiters is a sequence of inline nodes rather
|
|
18
|
+
* than a flat string.
|
|
19
|
+
*/
|
|
20
|
+
export const inlineSeqUntil = (stop) => {
    // One inline node, taken only when `stop` does not match here. `lazy`
    // defers the reference to `inlineMarkdownParser`.
    const nodeBeforeStop = map(
        seqC(not(stop), capture(lazy(() => inlineMarkdownParser), "node")),
        (caps) => caps.node
    );
    return many(nodeBeforeStop);
};
|
|
21
|
+
// `**…**` bold; the content is a list of nested inline nodes.
export const inlineBoldParser = map(
    seqC(
        str("**"),
        capture(inlineSeqUntil(str("**")), "content"),
        str("**")
    ),
    (caps) => ({ type: "inline-bold", content: caps.content })
);
|
|
22
|
+
// `*…*` italic; refuses to start where `**` would match.
export const inlineItalicParser = map(
    seqC(
        not(str("**")),
        char("*"),
        capture(inlineSeqUntil(char("*")), "content"),
        char("*")
    ),
    (caps) => ({ type: "inline-italic", content: caps.content })
);
|
|
23
|
+
/* URL + optional title used by both inline-link and inline-image parsers.
|
|
24
|
+
* `urlToken` is whitespace- and `)`-terminated. Empty destinations (`[a]()`)
|
|
25
|
+
* are allowed via `manyWithJoin` (zero-or-more). `titleClause` is an
|
|
26
|
+
* optional leading-space-separated `"..."` or `'...'`. Both are pure
|
|
27
|
+
* combinator-based so the link/image parsers can share them. */
|
|
28
|
+
// Link/image destination: zero or more characters other than space, tab, newline, or `)`.
const urlToken = manyWithJoin(noneOf(" \t\n)"));
|
|
29
|
+
// One or more spaces followed by a double- or single-quoted title; yields the title text.
const titleClause = map(
    seqC(
        many1(char(" ")),
        captureCaptures(or(
            seqC(char('"'), capture(manyTillStr('"'), "title"), char('"')),
            seqC(char("'"), capture(manyTillStr("'"), "title"), char("'"))
        ))
    ),
    (caps) => caps.title
);
|
|
30
|
+
// `[content](url)` or `[content](url "title")`. `title` is only set on the
// node when a title clause was present.
export const inlineLinkParser = map(
    seqC(
        char("["),
        capture(inlineSeqUntil(char("]")), "content"),
        str("]("),
        capture(urlToken, "url"),
        capture(optional(titleClause), "title"),
        char(")")
    ),
    (caps) => {
        const node = { type: "inline-link", content: caps.content, url: caps.url };
        if (caps.title != null) {
            node.title = caps.title;
        }
        return node;
    }
);
|
|
40
|
+
/* Multi-backtick code spans.
|
|
41
|
+
*
|
|
42
|
+
* `foo` → "foo"
|
|
43
|
+
* ``a`b`` → "a`b" (close on exactly N backticks)
|
|
44
|
+
* `` foo `` → "foo" (strip one space on each side when both)
|
|
45
|
+
* ` ` → " " (don't strip if content is all spaces)
|
|
46
|
+
*
|
|
47
|
+
* The opener is a run of N backticks; the closer is another run of *exactly*
|
|
48
|
+
* N backticks. Body atoms are either a single non-tick char or a tick run
|
|
49
|
+
* whose length is *not* N (so it can't be misread as the closer). The opener
|
|
50
|
+
* count threads into the closer via a small wrapper — every other piece is
|
|
51
|
+
* combinator-shaped. */
|
|
52
|
+
// Opening backtick run; its result is used as the fence length N by `inlineCodeParser`.
const tickRun = count(char("`"));
|
|
53
|
+
// Exactly `n` backticks that are not followed by another backtick.
const tickRunOf = (n) => seqR(
    exactly(n, char("`")),
    or(not(char("`")), eof)
);
|
|
54
|
+
// One piece of code-span body: a non-backtick char, or a backtick run that is
// not the closing run of length `n`.
const codeBodyAtom = (n) => {
    const otherTickRun = map(
        seqR(not(tickRunOf(n)), many1(char("`"))),
        (parts) => parts[1].join("")
    );
    return or(noneOf("`"), otherTickRun);
};
|
|
55
|
+
// The code-span body for a fence of length `n`: zero or more atoms, joined.
const codeBody = (n) => {
    return manyWithJoin(codeBodyAtom(n));
};
|
|
56
|
+
/* Removes one space from each end of a code-span body, but only when the body
 * both starts and ends with a space and is not made up entirely of whitespace.
 * Any other body is returned unchanged. */
const stripCodeSpan = (s) => {
    const padded = s.length >= 2 && s[0] === " " && s[s.length - 1] === " ";
    if (!padded || s.trim() === "") {
        return s;
    }
    return s.substring(1, s.length - 1);
};
|
|
59
|
+
/* Code span parser.
 *
 * Reads the opening backtick run, then parses the body followed by a closing
 * run of the same length. On success the node's `content` is the body after
 * `stripCodeSpan`. If the opener does not match, its failure is returned
 * as-is; if no closer is found, the failure is reported against the original
 * input so nothing is consumed. */
export const inlineCodeParser = (input) => {
    const fence = tickRun(input);
    if (!fence.success)
        return fence;
    const width = fence.result;
    const bodyThenCloser = map(seqR(codeBody(width), tickRunOf(width)), ([body]) => stripCodeSpan(body));
    const span = bodyThenCloser(fence.rest);
    if (span.success) {
        return success({ type: "inline-code", content: span.result }, span.rest);
    }
    return failure("unmatched code span fence", input);
};
|
|
70
|
+
/* Characters that may follow a backslash to form an escape. CommonMark allows
 * every ASCII punctuation character to be backslash-escaped, so the full set
 * is listed (this adds `"$%&',/:;=?@^` to the previous subset). Without `&`
 * here, `\&amp;` would fall through to the entity parser and lose its escape. */
const ESCAPABLE = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
|
|
71
|
+
// Backslash escape: `\` followed by one char from ESCAPABLE. Produces an "inline-text" node whose
// `content` is the captured char; the backslash itself is not captured.
export const inlineEscapeParser = seqC(set("type", "inline-text"), char("\\"), capture(oneOf(ESCAPABLE), "content"));
|
|
72
|
+
/* Combined bold+italic: `***…***` or `___…___`. Both forms yield the same
 * "inline-bold-italic" node; only the three-character fence differs, so one
 * builder is shared by the two alternatives. The `***` form is tried first. */
const boldItalicFencedBy = (fence) => map(
    seqC(str(fence), capture(inlineSeqUntil(str(fence)), "content"), str(fence)),
    (captured) => ({ type: "inline-bold-italic", content: captured.content })
);
export const inlineBoldItalicParser = or(boldItalicFencedBy("***"), boldItalicFencedBy("___"));
|
|
79
|
+
// Bold via `__…__`. The closing `__` must not be followed by an alphanumeric char (`not(alphanum)`).
export const inlineBoldUnderscoreParser = map(seqC(str("__"), capture(inlineSeqUntil(str("__")), "content"), str("__"), not(alphanum)), ({ content }) => ({ type: "inline-bold", content: content }));
|
|
80
|
+
// Italic via `_…_`. Refuses to start on `__` (that is the bold form), and the closing `_`
// must not be followed by an alphanumeric char.
export const inlineItalicUnderscoreParser = map(seqC(not(str("__")), char("_"), capture(inlineSeqUntil(char("_")), "content"), char("_"), not(alphanum)), ({ content }) => ({ type: "inline-italic", content: content }));
|
|
81
|
+
// URL body inside <...>: http(s)://<non-space, non-< or >>
|
|
82
|
+
// Matches `http`, an optional `s`, `://`, then one or more chars other than space, tab, newline, `<`, `>`.
// The matched pieces are joined back into a single URL string.
const urlBody = map(seqR(str("http"), or(str("s"), str("")), str("://"), many1WithJoin(noneOf(" \t\n<>"))), (parts) => parts.join(""));
|
|
83
|
+
// Email body: local@domain.tld — no spaces, no < > or duplicates of @ inside parts
|
|
84
|
+
// One segment of an email address: one or more chars excluding whitespace, `<`, `>`, `@` and `.`.
const emailPart = many1WithJoin(noneOf(" \t\n<>@."));
|
|
85
|
+
// Matches exactly `part@part.part` and joins the pieces back into one string.
// NOTE(review): because `emailPart` excludes `.`, addresses with a dot in the local part
// (`first.last@x.com`) or more than one dot in the domain (`a@mail.x.com`) do not match.
const emailBody = map(seqR(emailPart, char("@"), emailPart, char("."), emailPart), (parts) => parts.join(""));
|
|
86
|
+
// Wrap a literal string as the single-text content array used by InlineLink.
|
|
87
|
+
/* Builds the one-element content array used by autolinks: a single
 * "inline-text" node carrying the given string. */
const asTextContent = (s) => {
    const textNode = { type: "inline-text", content: s };
    return [textNode];
};
|
|
90
|
+
// Angle-bracket URL autolink `<http(s)://…>`: the URL is used both as the link target and as its text.
export const urlAutolinkParser = map(seqC(char("<"), capture(urlBody, "url"), char(">")), ({ url }) => ({ type: "inline-link", content: asTextContent(url), url }));
|
|
91
|
+
/* Angle-bracket email autolink `<local@domain.tld>`: the address is shown as
 * the link text and the target is the address prefixed with `mailto:`. */
export const emailAutolinkParser = map(seqC(char("<"), capture(emailBody, "email"), char(">")), (captured) => {
    const address = captured.email;
    return {
        type: "inline-link",
        content: asTextContent(address),
        url: "mailto:" + address,
    };
});
|
|
96
|
+
// Either angle-bracket autolink form; the URL form is tried before the email form.
export const autolinkParser = or(urlAutolinkParser, emailAutolinkParser);
|
|
97
|
+
/* Bare-URL GFM autolinks: `http(s)://…` without surrounding `<>`. The body
|
|
98
|
+
* is built from three kinds of atom so the punctuation/paren-balance rules
|
|
99
|
+
* fall out of combinator composition:
|
|
100
|
+
* - `bareUrlParenGroup` — a balanced `(...)` (recursive via `lazy`), so
|
|
101
|
+
* Wikipedia-style URLs like `…Lisp_(programming_language)` keep their
|
|
102
|
+
* parens and an unmatched trailing `)` falls through to the surrounding
|
|
103
|
+
* text;
|
|
104
|
+
* - `bareUrlPunctMidway` — one of `.,!?;:` accepted *only* when at least
|
|
105
|
+
* one non-punct atom follows, so trailing sentence punctuation stays
|
|
106
|
+
* in the surrounding text (a `not(...)` lookahead does the work);
|
|
107
|
+
* - `bareUrlNormalChar` — any other URL char.
|
|
108
|
+
*
|
|
109
|
+
* `urlBodyStop` is the lookahead set that ends a URL outside a paren group:
|
|
110
|
+
* whitespace, `<`, `>`, `)`, or end-of-input. */
|
|
111
|
+
// Scheme prefix of a bare URL: `http`, an optional `s`, then `://`, rebuilt as one string.
const bareUrlScheme = map(seqC(capture(str("http"), "scheme"), capture(optional(char("s")), "s"), str("://")), ({ scheme, s }) => scheme + (s !== null && s !== void 0 ? s : "") + "://");
|
|
112
|
+
// What ends a bare URL outside a paren group: space, tab, newline, `<`, `>`, `)` or end of input.
const urlBodyStop = or(oneOf(" \t\n<>)"), eof);
|
|
113
|
+
// Sentence punctuation that is kept inside a URL only when more URL text follows it.
const urlTrailingPunct = oneOf(".,!?;:");
|
|
114
|
+
// Any URL char that is not whitespace, an angle bracket, a paren, or one of the `urlTrailingPunct` chars.
const bareUrlNormalChar = noneOf(" \t\n<>().,!?;:");
|
|
115
|
+
/* A punctuation char accepted in the middle of a bare URL. The lookahead
 * rejects it when everything up to the next URL stop is also punctuation,
 * because then this char belongs to a trailing run such as the `.` that ends
 * a sentence and should stay in the surrounding text. */
const bareUrlPunctMidway = map(
    seqC(
        capture(urlTrailingPunct, "p"),
        not(seqR(many(urlTrailingPunct), urlBodyStop))
    ),
    (captured) => captured.p
);
|
|
119
|
+
// One unit of a bare URL body. Wrapped in `lazy` because `bareUrlParenGroup` (declared below) refers back to this parser.
const bareUrlAtom = lazy(() => or(bareUrlParenGroup, bareUrlPunctMidway, bareUrlNormalChar));
|
|
120
|
+
// A balanced `( … )` group inside a bare URL; the result is the group's source text including both parens.
const bareUrlParenGroup = map(seqC(capture(char("("), "open"), capture(manyWithJoin(bareUrlAtom), "inner"), capture(char(")"), "close")), ({ open, inner, close }) => open + inner + close);
|
|
121
|
+
// Everything after the scheme: one or more URL atoms joined into a string.
const bareUrlBody = many1WithJoin(bareUrlAtom);
|
|
122
|
+
/* Bare-URL autolink (no surrounding `<>`): scheme plus body form the URL,
 * which serves as both the link target and its single text child. */
export const bareUrlAutolinkParser = map(seqC(capture(bareUrlScheme, "scheme"), capture(bareUrlBody, "body")), (captured) => {
    const href = captured.scheme + captured.body;
    const label = { type: "inline-text", content: href };
    return { type: "inline-link", content: [label], url: href };
});
|
|
130
|
+
/* Inline HTML passthrough.
|
|
131
|
+
*
|
|
132
|
+
* Three CommonMark shapes are supported (each in its own exported parser):
|
|
133
|
+
* - open / self-closing tags: `<a>`, `<a href="x">`, `<br/>`,
|
|
134
|
+
* - close tags: `</a>`, `</a >`,
|
|
135
|
+
* - comments: `<!-- … -->`.
|
|
136
|
+
*
|
|
137
|
+
* The output is always an `InlineHTML` node whose `content` is the raw source
|
|
138
|
+
* (including the angle brackets), so downstream renderers pass it through
|
|
139
|
+
* untouched. We do not try to balance opens/closes or sanitise anything.
|
|
140
|
+
*
|
|
141
|
+
* The pieces below are shared helpers: `htmlTagName`, `htmlAttribute`,
|
|
142
|
+
* `htmlAttributes`, `htmlWS`. All built from combinators with named
|
|
143
|
+
* captures so the reconstructed string mirrors the source exactly. */
|
|
144
|
+
// Optional whitespace inside a tag: spaces, tabs and newlines, joined into a string.
const htmlWS = manyWithJoin(oneOf(" \t\n"));
|
|
145
|
+
// Mandatory whitespace inside a tag: at least one space, tab or newline.
const htmlWS1 = many1WithJoin(oneOf(" \t\n"));
|
|
146
|
+
// Tag name: a letter followed by any number of alphanumerics or `-`.
const htmlTagName = map(seqC(capture(letter, "first"), capture(manyWithJoin(or(alphanum, char("-"))), "rest")), ({ first, rest }) => first + rest);
|
|
147
|
+
// Attribute name: starts with a letter, `_` or `:`, then any number of alphanumerics or `_`, `:`, `.`, `-`.
const htmlAttrName = map(seqC(capture(or(letter, char("_"), char(":")), "first"), capture(manyWithJoin(or(alphanum, oneOf("_:.-"))), "rest")), ({ first, rest }) => first + rest);
|
|
148
|
+
// Double-quoted attribute value; the result keeps the surrounding quotes so the source can be rebuilt.
const dqAttrValue = map(seqC(char('"'), capture(manyTillStr('"'), "v"), char('"')), ({ v }) => `"${v}"`);
|
|
149
|
+
// Single-quoted attribute value; the result keeps the surrounding quotes so the source can be rebuilt.
const sqAttrValue = map(seqC(char("'"), capture(manyTillStr("'"), "v"), char("'")), ({ v }) => `'${v}'`);
|
|
150
|
+
// Unquoted attribute value: one or more chars excluding whitespace, quotes, `=`, `<`, `>` and backtick.
const unquotedAttrValue = many1WithJoin(noneOf(" \t\n\"'=<>`"));
|
|
151
|
+
// Any attribute value form, tried in order: double-quoted, single-quoted, unquoted.
const htmlAttrValue = or(dqAttrValue, sqAttrValue, unquotedAttrValue);
|
|
152
|
+
/* Optional `= value` suffix on an attribute. Whitespace is allowed on both
|
|
153
|
+
* sides of the `=` per CommonMark. */
|
|
154
|
+
// The `= value` part of an attribute, rebuilt with the whitespace found on each side of `=`.
const htmlAttrEq = map(seqC(capture(htmlWS, "wsBefore"), char("="), capture(htmlWS, "wsAfter"), capture(htmlAttrValue, "v")), ({ wsBefore, wsAfter, v }) => `${wsBefore}=${wsAfter}${v}`);
|
|
155
|
+
// One attribute: a name with an optional `= value` suffix (absent for boolean attributes); result is its source text.
const htmlAttribute = map(seqC(capture(htmlAttrName, "name"), capture(optional(htmlAttrEq), "eq")), ({ name, eq }) => name + (eq !== null && eq !== void 0 ? eq : ""));
|
|
156
|
+
/* Zero or more attributes, each separated from the previous token by
|
|
157
|
+
* at least one whitespace char. Returns the joined source (including the
|
|
158
|
+
* separating whitespace) so the outer parser can reconstruct the original. */
|
|
159
|
+
// The attribute list of a tag: repeated (whitespace + attribute) pairs, joined into one source string.
const htmlAttributes = manyWithJoin(map(seqC(capture(htmlWS1, "ws"), capture(htmlAttribute, "attr")), ({ ws, attr }) => ws + attr));
|
|
160
|
+
/* Open or self-closing tag: `<name attrs ws />` where attributes, trailing
 * whitespace and the `/` are all optional. The node's `content` is the tag's
 * source text rebuilt from the captured pieces. */
export const htmlOpenTagParser = map(seqC(char("<"), capture(htmlTagName, "name"), capture(htmlAttributes, "attrs"), capture(htmlWS, "ws"), capture(optional(char("/")), "selfClose"), char(">")), (tag) => {
    // `selfClose` is null/undefined when the tag has no trailing slash.
    const slash = tag.selfClose == null ? "" : tag.selfClose;
    const raw = "<" + tag.name + tag.attrs + tag.ws + slash + ">";
    return { type: "inline-html", content: raw };
});
|
|
164
|
+
/* Close tag: `</name>` with optional whitespace before `>`. The node's
 * `content` is the tag's source text rebuilt from the captured pieces. */
export const htmlCloseTagParser = map(seqC(str("</"), capture(htmlTagName, "name"), capture(htmlWS, "ws"), char(">")), (tag) => {
    const raw = "</" + tag.name + tag.ws + ">";
    return { type: "inline-html", content: raw };
});
|
|
168
|
+
/* HTML comments: `<!-- … -->`. CommonMark rules:
|
|
169
|
+
* - the body may not contain `--`,
|
|
170
|
+
* - the body may not start or end with `>`.
|
|
171
|
+
*
|
|
172
|
+
* Expressed as pure combinators by baking the constraints into the body atom:
|
|
173
|
+
* - `not(str("-->"))` so we stop cleanly at the closer,
|
|
174
|
+
* - `not(str("--"))` rejects a `--` mid-body,
|
|
175
|
+
* - `not(seqR(char(">"), str("-->")))` is the "end-of-body `>` " rule —
|
|
176
|
+
* a `>` directly before the closer is rejected, since accepting it
|
|
177
|
+
* would let the comment end on `>`.
|
|
178
|
+
*
|
|
179
|
+
* The start-of-body `>` rule is enforced with one `not(char(">"))` placed
|
|
180
|
+
* before the body's `many1`. An empty body falls through to `optional`'s
|
|
181
|
+
* null branch, which leaves the input unconsumed so the closer can match
|
|
182
|
+
* immediately. */
|
|
183
|
+
// One char of an HTML comment body. Refuses to start at `-->`, at any `--`, or at a `>` that sits
// directly before the closer. (The `-->` check is already implied by the `--` check.)
const commentBodyChar = map(seqC(not(str("-->")), not(str("--")), not(seqR(char(">"), str("-->"))), capture(anyChar, "c")), ({ c }) => c);
|
|
184
|
+
// Comment body as a string: must not begin with `>`; an absent body becomes "".
const commentBody = map(optional(map(seqC(not(char(">")), capture(many1WithJoin(commentBodyChar), "body")), ({ body }) => body)), (body) => body !== null && body !== void 0 ? body : "");
|
|
185
|
+
/* HTML comment `<!-- … -->`. The node's `content` is the comment's source
 * text rebuilt around the parsed body. */
export const htmlCommentParser = map(seqC(str("<!--"), capture(commentBody, "body"), str("-->")), (comment) => {
    const raw = "<!--" + comment.body + "-->";
    return { type: "inline-html", content: raw };
});
|
|
189
|
+
/* Inline HTML dispatch. `htmlCommentParser` runs first so `<!--…-->` isn't
|
|
190
|
+
* stolen by `htmlOpenTagParser` (which would otherwise see `<!` and bail).
|
|
191
|
+
* `htmlCloseTagParser` runs before `htmlOpenTagParser` because the open-tag
|
|
192
|
+
* parser would accept `<` followed by a tag name and we want `</a>` to win
|
|
193
|
+
* over an attempted `<` + `/a` (which isn't a valid attribute shape anyway). */
|
|
194
|
+
// Inline HTML entry point; alternatives are tried in the order comment, close tag, open tag.
export const htmlInlineParser = or(htmlCommentParser, htmlCloseTagParser, htmlOpenTagParser);
|
|
195
|
+
// Footnote reference: `[^id]` (id has no `]`, `\n`, or spaces).
|
|
196
|
+
// Footnote reference `[^id]`: yields an "inline-footnote-ref" node with the captured `id`
// (one or more chars other than `]`, space, newline or tab).
export const inlineFootnoteRefParser = seqC(set("type", "inline-footnote-ref"), str("[^"), capture(many1WithJoin(noneOf("] \n\t")), "id"), char("]"));
|
|
197
|
+
// `[...]` where ... is one or more characters that aren't `]` or newline.
|
|
198
|
+
// Bracketed label `[...]` whose inner chars exclude `]` and newline; `bracketedAsString` joins the result into a string.
const bracketed = between(char("["), char("]"), noneOf("]\n"));
|
|
199
|
+
// Same as `bracketed`, with the matched chars joined into a single string.
const bracketedAsString = map(bracketed, (chars) => chars.join(""));
|
|
200
|
+
/* Reference-style link: `[text]` or `[text][id]`. When the second bracket
 * pair is missing or empty, the text doubles as the id. A `(` right after
 * the brackets makes this parser fail, since that is inline-link syntax. */
export const inlineRefLinkParser = map(
    seqC(
        capture(bracketedAsString, "text"),
        capture(optional(bracketedAsString), "rawId"),
        not(char("("))
    ),
    (captured) => {
        const label = captured.text;
        const explicitId = captured.rawId;
        return {
            type: "inline-ref-link",
            text: label,
            id: explicitId && explicitId.length > 0 ? explicitId : label,
        };
    }
);
|
|
206
|
+
/* Reference-style image: `![alt]` or `![alt][id]`. When the second bracket
 * pair is missing or empty, the alt text doubles as the id. A `(` right
 * after the brackets makes this parser fail, since that is inline-image
 * syntax. */
export const inlineRefImageParser = map(
    seqC(
        char("!"),
        capture(bracketedAsString, "alt"),
        capture(optional(bracketedAsString), "rawId"),
        not(char("("))
    ),
    (captured) => {
        const altText = captured.alt;
        const explicitId = captured.rawId;
        return {
            type: "inline-ref-image",
            alt: altText,
            id: explicitId && explicitId.length > 0 ? explicitId : altText,
        };
    }
);
|
|
211
|
+
/** An inline image: `![alt](url)` or `![alt](url "title")`. Lives in `inline.ts`
|
|
212
|
+
* so it can participate in paragraph parsing without `blocks.ts` becoming a
|
|
213
|
+
* circular dep. */
|
|
214
|
+
/* Inline image: `![alt](url)` with an optional title after the url. `alt` is
 * the raw text up to the first `](`; `url` and `title` use the same
 * `urlToken` / `titleClause` helpers as inline links. The `title` key is only
 * set when a title was parsed.
 *
 * NOTE(review): the opener `str("![")` and the alt sub-parser
 * `manyTillStr("](")` were restored here because the line as received was
 * not valid syntax (`str(", "alt")`). Confirm against the published source. */
export const imageParser = map(seqC(str("!["), capture(manyTillStr("]("), "alt"), str("]("), capture(urlToken, "url"), capture(optional(titleClause), "title"), char(")")), ({ alt, url, title }) => {
    const img = { type: "image", alt, url };
    if (title != null)
        img.title = title;
    return img;
});
|
|
220
|
+
/* Hard line break. Two source forms are recognised, both ending in `\n`:
 *   - two or more trailing spaces (a single trailing space is NOT a hard break),
 *   - a backslash directly before the newline.
 * Either form yields an "inline-hard-break" node with no other fields. */
export const hardBreakParser = map(or(
// two-or-more trailing spaces then newline: a literal two-space prefix, then any further spaces
seqR(str("  "), many(char(" ")), char("\n")),
// backslash then newline
seqR(char("\\"), char("\n"))), () => ({ type: "inline-hard-break" }));
|
|
225
|
+
/** A single `\n` that is *not* part of a blank line (which would terminate the
|
|
226
|
+
* enclosing paragraph). Hard breaks are matched earlier in `inlineMarkdownParser`'s
|
|
227
|
+
* `or` so a "  \n" (two or more spaces, then a newline) stays a hard break, never a soft one. */
|
|
228
|
+
// Soft break: a newline that is not immediately followed by another newline. Yields an "inline-soft-break" node.
export const softBreakParser = map(seqR(char("\n"), not(char("\n"))), () => ({ type: "inline-soft-break" }));
|
|
229
|
+
/* Strikethrough `~~…~~`: the content between the two `~~` fences becomes the
 * children of an "inline-strike" node. */
export const inlineStrikeParser = map(seqC(str("~~"), capture(inlineSeqUntil(str("~~")), "content"), str("~~")), (captured) => {
    const children = captured.content;
    return { type: "inline-strike", content: children };
});
|
|
233
|
+
/* HTML entities. Decodes:
|
|
234
|
+
* - the five XML-core named entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
|
|
235
|
+
* `&apos;`) into their literal characters,
|
|
236
|
+
* - decimal numeric references (`&#NN;`),
|
|
237
|
+
* - hexadecimal numeric references (`&#xNN;` / `&#XNN;`).
|
|
238
|
+
*
|
|
239
|
+
* Unknown named entities (e.g. `&unknown;`) fail this parser and fall
|
|
240
|
+
* through to `inlineLiteralCharParser`, which emits a literal `&`. */
|
|
241
|
+
// The five XML-core named entities, each mapped to the character it stands for.
// The entity names must be spelled out in full (`&amp;` etc.): matching a bare `&`
// would consume every ampersand, and an unescaped `"""` is not valid syntax.
const namedEntity = or(map(str("&amp;"), () => "&"), map(str("&lt;"), () => "<"), map(str("&gt;"), () => ">"), map(str("&quot;"), () => '"'), map(str("&apos;"), () => "'"));
|
|
242
|
+
// Decimal numeric reference `&#NN;`. Values that are not Unicode scalar values
// (0, surrogates, anything above U+10FFFF) decode to U+FFFD rather than being
// passed to String.fromCodePoint, which throws a RangeError for out-of-range
// input such as `&#99999999;` and would abort the whole parse.
const decimalEntity = map(seqC(str("&#"), capture(many1WithJoin(digit), "digits"), char(";")), ({ digits }) => {
    const codePoint = parseInt(digits, 10);
    const isScalar = codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
    return isScalar ? String.fromCodePoint(codePoint) : "\uFFFD";
});
|
|
243
|
+
// Hexadecimal numeric reference `&#xNN;` / `&#XNN;`. Values that are not
// Unicode scalar values (0, surrogates, anything above U+10FFFF) decode to
// U+FFFD rather than being passed to String.fromCodePoint, which throws a
// RangeError for out-of-range input such as `&#xFFFFFFFF;`.
const hexEntity = map(seqC(or(str("&#x"), str("&#X")), capture(many1WithJoin(oneOf("0123456789abcdefABCDEF")), "digits"), char(";")), ({ digits }) => {
    const codePoint = parseInt(digits, 16);
    const isScalar = codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
    return isScalar ? String.fromCodePoint(codePoint) : "\uFFFD";
});
|
|
244
|
+
// Any supported entity, tried as hex, then decimal, then named; the decoded text is wrapped in an "inline-text" node.
export const htmlEntityParser = map(or(hexEntity, decimalEntity, namedEntity), (content) => ({ type: "inline-text", content }));
|
|
245
|
+
/** Last-resort: consume a single delimiter char as literal text so unmatched
|
|
246
|
+
* delimiters (e.g. the `_` in snake_case_word, or a stray `*`) don't crash
|
|
247
|
+
* the paragraph. Matches one of the inline-text stop characters. */
|
|
248
|
+
// Fallback: takes exactly one of the delimiter chars * _ ` [ ] ! < ~ \ & and emits it as an "inline-text" node.
export const inlineLiteralCharParser = seqC(set("type", "inline-text"), capture(oneOf("*_`[]!<~\\&"), "content"));
|
|
249
|
+
// Top-level inline alternation. Order matters: hard breaks and escapes come first, plain text and the
// single-char literal fallback come last. Note that `softBreakParser` is not one of the alternatives here.
export const inlineMarkdownParser = or(hardBreakParser, inlineEscapeParser, inlineBoldItalicParser, inlineBoldParser, inlineItalicParser, inlineBoldUnderscoreParser, inlineItalicUnderscoreParser, inlineStrikeParser, autolinkParser, bareUrlAutolinkParser, htmlInlineParser, imageParser, inlineRefImageParser, inlineFootnoteRefParser, inlineLinkParser, inlineRefLinkParser, inlineCodeParser, htmlEntityParser, inlineTextParser, inlineLiteralCharParser);
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { Parser } from "../../types.js";
|
|
2
|
+
import { LinkDef, FootnoteDef } from "./types.js";
|
|
3
|
+
/** Parses a reference link definition: `[id]: url` with an optional `"title"`. */
export declare const linkDefinitionParser: Parser<LinkDef>;
|
|
4
|
+
/** Parses a single-line footnote definition: `[^id]: text`. */
export declare const footnoteDefinitionParser: Parser<FootnoteDef>;
|
|
5
|
+
/**
 * Post-parse pass over the top-level AST: collects link and footnote
 * definitions, rewrites reference-style links, images and footnote references
 * against them (ids compared case-insensitively), and returns a new array
 * with the definition nodes removed.
 */
export declare function resolveReferences(ast: unknown[]): unknown[];
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { seqC, capture, optional, many1WithJoin, map, } from "../../combinators.js";
|
|
2
|
+
import { char, str, set, noneOf, spaces } from "../../parsers.js";
|
|
3
|
+
/* Reference link definitions.
|
|
4
|
+
*
|
|
5
|
+
* [id]: url
|
|
6
|
+
* [id]: url "title"
|
|
7
|
+
*
|
|
8
|
+
* Built entirely from combinators. The optional title is `seqR(spaces, "..."`
|
|
9
|
+
* unwrapped via `map`. */
|
|
10
|
+
// Definition id: one or more chars other than `]` and newline.
const idChars = many1WithJoin(noneOf("]\n"));
|
|
11
|
+
// Definition url: one or more chars other than space, tab and newline.
const urlChars = many1WithJoin(noneOf(" \t\n"));
|
|
12
|
+
// Definition title text: one or more chars other than `"` and newline, so an empty `""` title does not match.
const titleChars = many1WithJoin(noneOf('"\n'));
|
|
13
|
+
// Title suffix of a definition: `spaces`, then a double-quoted string; result is the text between the quotes.
// NOTE(review): only `"…"` is accepted here, whereas inline links also accept `'…'` titles.
const titleParser = map(seqC(spaces, char('"'), capture(titleChars, "title"), char('"')), ({ title }) => title);
|
|
14
|
+
// `[id]: url` with an optional `"title"`. Produces a "link-definition" node with `id`, `url`
// and, when a title was parsed, `title`.
export const linkDefinitionParser = seqC(set("type", "link-definition"), char("["), capture(idChars, "id"), str("]:"), spaces, capture(urlChars, "url"), optional(capture(titleParser, "title")));
|
|
15
|
+
/* Footnote definitions: `[^id]: text` on a single line. */
|
|
16
|
+
// `[^id]: text` on one line. Produces a "footnote-definition" node; `content` is the rest of the line
// (one or more non-newline chars) kept as a raw string.
export const footnoteDefinitionParser = seqC(set("type", "footnote-definition"), str("[^"), capture(many1WithJoin(noneOf("] \n\t")), "id"), str("]:"), spaces, capture(many1WithJoin(noneOf("\n")), "content"));
|
|
17
|
+
/* Resolution pass.
|
|
18
|
+
*
|
|
19
|
+
* Walk the AST. Collect link-definitions, then rewrite ref nodes to inline
|
|
20
|
+
* links/images and strip the definitions. Id matching is case-insensitive. */
|
|
21
|
+
export function resolveReferences(ast) {
|
|
22
|
+
const linkDefs = new Map();
|
|
23
|
+
const footnoteDefs = new Map();
|
|
24
|
+
for (const node of ast) {
|
|
25
|
+
if (!isObj(node))
|
|
26
|
+
continue;
|
|
27
|
+
const t = node.type;
|
|
28
|
+
if (t === "link-definition") {
|
|
29
|
+
const def = node;
|
|
30
|
+
linkDefs.set(def.id.toLowerCase(), def);
|
|
31
|
+
}
|
|
32
|
+
else if (t === "footnote-definition") {
|
|
33
|
+
const def = node;
|
|
34
|
+
footnoteDefs.set(def.id.toLowerCase(), def);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
function walk(node) {
|
|
38
|
+
if (Array.isArray(node))
|
|
39
|
+
return node.map(walk);
|
|
40
|
+
if (!isObj(node))
|
|
41
|
+
return node;
|
|
42
|
+
const obj = node;
|
|
43
|
+
if (obj.type === "inline-ref-link") {
|
|
44
|
+
const def = linkDefs.get(String(obj.id).toLowerCase());
|
|
45
|
+
if (def) {
|
|
46
|
+
const link = {
|
|
47
|
+
type: "inline-link",
|
|
48
|
+
content: [{ type: "inline-text", content: String(obj.text) }],
|
|
49
|
+
url: def.url,
|
|
50
|
+
};
|
|
51
|
+
if (def.title != null)
|
|
52
|
+
link.title = def.title;
|
|
53
|
+
return link;
|
|
54
|
+
}
|
|
55
|
+
return { type: "inline-text", content: `[${obj.text}]` };
|
|
56
|
+
}
|
|
57
|
+
if (obj.type === "inline-ref-image") {
|
|
58
|
+
const def = linkDefs.get(String(obj.id).toLowerCase());
|
|
59
|
+
if (def) {
|
|
60
|
+
const img = {
|
|
61
|
+
type: "image",
|
|
62
|
+
url: def.url,
|
|
63
|
+
alt: obj.alt,
|
|
64
|
+
};
|
|
65
|
+
if (def.title != null)
|
|
66
|
+
img.title = def.title;
|
|
67
|
+
return img;
|
|
68
|
+
}
|
|
69
|
+
return { type: "inline-text", content: `![${obj.alt}]` };
|
|
70
|
+
}
|
|
71
|
+
if (obj.type === "inline-footnote-ref") {
|
|
72
|
+
const def = footnoteDefs.get(String(obj.id).toLowerCase());
|
|
73
|
+
if (def) {
|
|
74
|
+
return { type: "inline-footnote-ref", id: obj.id, content: def.content };
|
|
75
|
+
}
|
|
76
|
+
return { type: "inline-text", content: `[^${obj.id}]` };
|
|
77
|
+
}
|
|
78
|
+
// recurse into known child-bearing fields
|
|
79
|
+
const out = Object.assign({}, obj);
|
|
80
|
+
for (const key of ["content", "items", "rows"]) {
|
|
81
|
+
if (Array.isArray(obj[key]))
|
|
82
|
+
out[key] = obj[key].map(walk);
|
|
83
|
+
}
|
|
84
|
+
if (obj.sublist)
|
|
85
|
+
out.sublist = walk(obj.sublist);
|
|
86
|
+
return out;
|
|
87
|
+
}
|
|
88
|
+
return ast
|
|
89
|
+
.filter((n) => !(isObj(n) &&
|
|
90
|
+
(n.type === "link-definition" ||
|
|
91
|
+
n.type === "footnote-definition")))
|
|
92
|
+
.map(walk);
|
|
93
|
+
}
|
|
94
|
+
function isObj(v) {
|
|
95
|
+
return typeof v === "object" && v !== null;
|
|
96
|
+
}
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/** Union of every inline node shape; each member is tagged by its `type` field. */
export type InlineMarkdown = InlineText | InlineSoftBreak | InlineBold | InlineItalic | InlineBoldItalic | InlineStrike | InlineHardBreak | InlineLink | InlineCode | Image | InlineRefLink | InlineRefImage | InlineFootnoteRef | InlineHTML;
|
|
2
|
+
export type InlineHTML = {
|
|
3
|
+
type: "inline-html";
|
|
4
|
+
/** Raw passthrough source including angle brackets. */
|
|
5
|
+
content: string;
|
|
6
|
+
};
|
|
7
|
+
export type InlineText = {
|
|
8
|
+
type: "inline-text";
|
|
9
|
+
content: string;
|
|
10
|
+
};
|
|
11
|
+
export type InlineBold = {
|
|
12
|
+
type: "inline-bold";
|
|
13
|
+
content: InlineMarkdown[];
|
|
14
|
+
};
|
|
15
|
+
export type InlineItalic = {
|
|
16
|
+
type: "inline-italic";
|
|
17
|
+
content: InlineMarkdown[];
|
|
18
|
+
};
|
|
19
|
+
export type InlineBoldItalic = {
|
|
20
|
+
type: "inline-bold-italic";
|
|
21
|
+
content: InlineMarkdown[];
|
|
22
|
+
};
|
|
23
|
+
export type InlineStrike = {
|
|
24
|
+
type: "inline-strike";
|
|
25
|
+
content: InlineMarkdown[];
|
|
26
|
+
};
|
|
27
|
+
export type InlineHardBreak = {
|
|
28
|
+
type: "inline-hard-break";
|
|
29
|
+
};
|
|
30
|
+
export type InlineSoftBreak = {
|
|
31
|
+
type: "inline-soft-break";
|
|
32
|
+
};
|
|
33
|
+
export type InlineLink = {
|
|
34
|
+
type: "inline-link";
|
|
35
|
+
content: InlineMarkdown[];
|
|
36
|
+
url: string;
|
|
37
|
+
title?: string;
|
|
38
|
+
};
|
|
39
|
+
export type InlineCode = {
|
|
40
|
+
type: "inline-code";
|
|
41
|
+
content: string;
|
|
42
|
+
};
|
|
43
|
+
export type Paragraph = {
|
|
44
|
+
type: "paragraph";
|
|
45
|
+
content: InlineMarkdown[];
|
|
46
|
+
};
|
|
47
|
+
export type Heading = {
|
|
48
|
+
type: "heading";
|
|
49
|
+
level: number;
|
|
50
|
+
content: InlineMarkdown[];
|
|
51
|
+
};
|
|
52
|
+
export type CodeBlock = {
|
|
53
|
+
type: "code-block";
|
|
54
|
+
content: string;
|
|
55
|
+
language: string | null;
|
|
56
|
+
};
|
|
57
|
+
/** A child of a block quote: either an inline node or a nested block quote. */
export type BlockQuoteContent = InlineMarkdown | BlockQuote;
|
|
58
|
+
export type BlockQuote = {
|
|
59
|
+
type: "block-quote";
|
|
60
|
+
content: BlockQuoteContent[];
|
|
61
|
+
};
|
|
62
|
+
export type Image = {
|
|
63
|
+
type: "image";
|
|
64
|
+
url: string;
|
|
65
|
+
alt: string;
|
|
66
|
+
title?: string;
|
|
67
|
+
};
|
|
68
|
+
export type InlineRefLink = {
|
|
69
|
+
type: "inline-ref-link";
|
|
70
|
+
text: string;
|
|
71
|
+
id: string;
|
|
72
|
+
};
|
|
73
|
+
export type InlineRefImage = {
|
|
74
|
+
type: "inline-ref-image";
|
|
75
|
+
alt: string;
|
|
76
|
+
id: string;
|
|
77
|
+
};
|
|
78
|
+
export type ListItem = {
|
|
79
|
+
content: InlineMarkdown[];
|
|
80
|
+
sublist?: List;
|
|
81
|
+
/** GFM task-list state: `true` for `[x]`/`[X]`, `false` for `[ ]`, absent for plain items. */
|
|
82
|
+
checked?: boolean;
|
|
83
|
+
};
|
|
84
|
+
export type List = {
|
|
85
|
+
type: "list";
|
|
86
|
+
ordered: boolean;
|
|
87
|
+
start: number;
|
|
88
|
+
items: ListItem[];
|
|
89
|
+
};
|
|
90
|
+
export type HorizontalRule = {
|
|
91
|
+
type: "horizontal-rule";
|
|
92
|
+
};
|
|
93
|
+
/** Alignment of one table column. NOTE(review): `null` presumably means no alignment was specified; confirm against the table parser. */
export type Alignment = "left" | "right" | "center" | null;
|
|
94
|
+
export type Table = {
|
|
95
|
+
type: "table";
|
|
96
|
+
headers: string[];
|
|
97
|
+
alignments: Alignment[];
|
|
98
|
+
rows: string[][];
|
|
99
|
+
};
|
|
100
|
+
export type LinkDef = {
|
|
101
|
+
type: "link-definition";
|
|
102
|
+
id: string;
|
|
103
|
+
url: string;
|
|
104
|
+
title?: string;
|
|
105
|
+
};
|
|
106
|
+
export type InlineFootnoteRef = {
|
|
107
|
+
type: "inline-footnote-ref";
|
|
108
|
+
id: string;
|
|
109
|
+
/** Filled in by `resolveReferences` when a matching FootnoteDef exists. */
|
|
110
|
+
content?: string;
|
|
111
|
+
};
|
|
112
|
+
export type FootnoteDef = {
|
|
113
|
+
type: "footnote-definition";
|
|
114
|
+
id: string;
|
|
115
|
+
content: string;
|
|
116
|
+
};
|
|
117
|
+
export type HTMLBlock = {
|
|
118
|
+
type: "html-block";
|
|
119
|
+
content: string;
|
|
120
|
+
};
|
|
121
|
+
/** A frontmatter value: a scalar (string, number, boolean, null) or an arbitrarily nested array of such values. Objects are not part of this type. */
export type FrontmatterValue = string | number | boolean | null | FrontmatterValue[];
|
|
122
|
+
export type Frontmatter = {
|
|
123
|
+
type: "frontmatter";
|
|
124
|
+
data: Record<string, FrontmatterValue>;
|
|
125
|
+
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "tarsec",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "A parser combinator library for TypeScript, inspired by Parsec.",
|
|
5
5
|
"homepage": "https://github.com/egonSchiele/tarsec",
|
|
6
6
|
"scripts": {
|
|
@@ -19,6 +19,11 @@
|
|
|
19
19
|
".": {
|
|
20
20
|
"import": "./dist/index.js",
|
|
21
21
|
"require": "./dist/index.js"
|
|
22
|
+
},
|
|
23
|
+
"./parsers/markdown": {
|
|
24
|
+
"import": "./dist/parsers/markdown/index.js",
|
|
25
|
+
"require": "./dist/parsers/markdown/index.js",
|
|
26
|
+
"types": "./dist/parsers/markdown/index.d.ts"
|
|
22
27
|
}
|
|
23
28
|
},
|
|
24
29
|
"type": "module",
|